| object name | description |
|---|---|
| pDat | Phenotype characteristics of the samples |
| t_only_miRNA | Expression-only data –> transposed version of raw_miRNA_df |
| meta_dat | Manipulated data |
Loading libraries:
library(dplyr)
library(tidyr)
library(readxl)
library(tidyverse)
library(DT)
library(knitr)
library(kableExtra)
library(ggplot2)
library (plotly)
library(rmarkdown)
library(limma)
library (xlsx)
# Read the "miRNA" sheet. col_names = FALSE keeps every spreadsheet row,
# including the first one, as data instead of using it as a header.
raw_miRNA <- read_excel(
  path = "Z:/Nikita/Projects/mirna_fetal_tissues/data/external/magda_cleanedmiRNAdata.xlsx",
  sheet = "miRNA",
  col_names = FALSE
)
Transposing data:
# Transpose the sheet, then promote the first row of the transposed matrix
# to column names.
t_raw_mirna <- t(raw_miRNA)
colnames(t_raw_mirna) <- t_raw_mirna[1, ]
# Drop the row that now serves as the header (still a matrix as long as more
# than one row remains).
t_raw_mirna <- t_raw_mirna[-1, ]
# Data-frame copy of the matrix, with the first column labelled "ID".
raw_miRNA_df <- as.data.frame(t_raw_mirna)
names(raw_miRNA_df)[1] <- "ID"
Trying to get a list of only the miRNA names:
## Error: Can't bind data because some arguments have the same name
## Error: Columns `hsa-let-7a-3p`, `hsa-let-7a-5p`, `hsa-let-7a-5p`, `hsa-let-7f-5p`, `hsa-miR-1`, ... must have a unique name
The above error and select shows that there are duplicates of several miRNA names – this will cause problems later on. So, it’s better to have an object/value containing the names as a list which might be of use later.
Using t_raw_mirna where the names are in separate columns as opposed to rows as above in raw_miRNA_df:
## [1] "hsa-let-7a-2-3p" "hsa-let-7a-3p" "hsa-let-7a-3p" "hsa-let-7a-5p"
## [5] "hsa-let-7a-5p" "hsa-let-7a-5p"
This has now given us a variable mi_names, which is a list of the miRNA names as they appear in the raw data.
# Phenotype table: the first 28 columns, without the last 8 rows.
pDat <- head(raw_miRNA_df[seq_len(28)], -8)
# o_miRNA.df <- raw_miRNA_df[-c(2:28)]
# Expression table: the same rows, with the first 28 columns removed.
only_miRNA <- head(as.data.frame(raw_miRNA_df), -8)[-seq_len(28)]
# Transpose the expression table and turn the result back into a data frame.
t_only_miRNA <- as.data.frame(t(only_miRNA))
# Adding column names: take the ID column of pDat as a plain character
# vector and use it to label the columns of the transposed expression table.
id <- as.character(pDat[["ID"]])
names(t_only_miRNA) <- id
Converting data from factors to numeric:
Before adding in any character vectors, I’m going to do a numeric calculation to get the mean expression-
To calculate the mean expression of the miRNAs across all 106 samples:
# Add the row-wise mean over all columns of the expression table and move it
# to the first position.
meta_dat <- t_only_miRNA %>%
  mutate(Mean_expression = rowMeans(t_only_miRNA)) %>%
  select(Mean_expression, everything())
However, applying the above step mutate leads to tidyverse dropping the rownames. This is where the earlier mi_names list will come in handy.
And so, we can either have the row names as actual rownames or as a separate data column.
–> Trying out as actual rownames:
## Error in `.rowNamesDF<-`(x, value = value): duplicate 'row.names' are not allowed
The above doesn’t work because some miRNA names are duplicated: expression of the same miRNA was observed more than once, from different genomic locations.
–> So, adding names as a separate column
# Attach the miRNA names as an ordinary column and move it to the front.
meta_dat <- meta_dat %>%
  mutate(names = mi_names) %>%
  select("names", everything())
# print(as_tibble(t_only_miRNA), n = 5)
paged_table(meta_dat)
Chromosome information added in the same manner as above, chr:
# Take column 108 of the raw sheet, skip its first 28 entries and flatten the
# remainder into a vector.
chr <- raw_miRNA %>%
  select(108) %>%
  tail(-28) %>%
  unlist()
# Attach it as a column and keep the two identifying columns first.
meta_dat <- meta_dat %>%
  mutate(chr = chr) %>%
  select("chr", "names", everything())
Joining the chr column with the miRNA names column to give unique characters (since as seen before, there are duplicates):
# Build a combined "<chr>_<name>" key for every row.
meta_dat <- meta_dat %>%
  mutate(row_names = paste(chr, names, sep = "_"))
# Pull the key out as a character vector and try to use it as the row names.
up_rn <- as.character(meta_dat[["row_names"]])
row.names(meta_dat) <- up_rn
## Warning: non-unique values when setting 'row.names': 'chr1_hsa-miR-3116',
## 'chr1_hsa-miR-3118', 'chr1_hsa-miR-3119', 'chr1_hsa-miR-320b', 'chr1_hsa-
## miR-6077', 'chr10_hsa-miR-1254', 'chr10_hsa-miR-3158-3p', 'chr10_hsa-
## miR-3158-5p', 'chr10_hsa-miR-4679', 'chr10_hsa-miR-511-3p', 'chr10_hsa-
## miR-511-5p', 'chr11_hsa-miR-3160-3p', 'chr11_hsa-miR-3160-5p', 'chr12_hsa-
## miR-1244', 'chr12_hsa-miR-3913-3p', 'chr12_hsa-miR-3913-5p', 'chr14_hsa-
## miR-1185-5p', 'chr14_hsa-miR-329-3p', 'chr14_hsa-miR-329-5p', 'chr14_hsa-
## miR-376a-3p', 'chr14_hsa-miR-8071', 'chr15_hsa-miR-1233-3p', 'chr15_hsa-
## miR-1233-5p', 'chr15_hsa-miR-3118', 'chr15_hsa-miR-4509', 'chr15_hsa-miR-5701',
## 'chr15_hsa-miR-7973', 'chr16_hsa-miR-1972', 'chr16_hsa-miR-3179', 'chr16_hsa-
## miR-3180', 'chr16_hsa-miR-3180-3p', 'chr16_hsa-miR-3180-5p', 'chr16_hsa-
## miR-3670', 'chr16_hsa-miR-3680-3p', 'chr16_hsa-miR-3680-5p', 'chr16_hsa-
## miR-6511a-3p', 'chr16_hsa-miR-6511a-5p', 'chr16_hsa-miR-6511b-3p', 'chr16_hsa-
## miR-6511b-5p', 'chr16_hsa-miR-6770-3p', 'chr16_hsa-miR-6770-5p', 'chr16_hsa-
## miR-6862-3p', 'chr16_hsa-miR-6862-5p', 'chr17_hsa-miR-4315', 'chr18_hsa-
## miR-320c', 'chr18_hsa-miR-5583-3p', 'chr18_hsa-miR-5583-5p', 'chr19_hsa-
## miR-1270', 'chr19_hsa-miR-1283', 'chr19_hsa-miR-512-3p', 'chr19_hsa-miR-512-5p',
## 'chr19_hsa-miR-515-3p', 'chr19_hsa-miR-515-5p', 'chr19_hsa-miR-516a-3p',
## 'chr19_hsa-miR-516a-5p', 'chr19_hsa-miR-516b-3p', 'chr19_hsa-miR-516b-5p',
## 'chr19_hsa-miR-517-5p', 'chr19_hsa-miR-518a-3p', 'chr19_hsa-miR-518a-5p',
## 'chr19_hsa-miR-519a-3p', 'chr19_hsa-miR-521', 'chr19_hsa-miR-526a', 'chr2_hsa-
## miR-1302', 'chr2_hsa-miR-3130-3p', 'chr2_hsa-miR-3130-5p', 'chr2_hsa-miR-4435',
## 'chr2_hsa-miR-4436b-3p', 'chr2_hsa-miR-4436b-5p', 'chr2_hsa-miR-4771',
## 'chr2_hsa-miR-4773', 'chr2_hsa-miR-4776-3p', 'chr2_hsa-miR-4776-5p', 'chr20_hsa-
## miR-941', 'chr22_hsa-miR-3199', 'chr4_hsa-miR-3688-3p', 'chr4_hsa-miR-3688-5p',
## 'chr6_hsa-miR-548a-3p', 'chr7_hsa-miR-3914', 'chr7_hsa-miR-4283', 'chr7_hsa-
## miR-4650-3p', 'chr7_hsa-miR-4650-5p', 'chr7_hsa-miR-550a-3p', 'chr7_hsa-
## miR-550a-5p', 'chr7_hsa-miR-550b-2-5p', 'chr7_hsa-miR-550b-3p', 'chr8_hsa-
## miR-124-3p', 'chr8_hsa-miR-124-5p', 'chr8_hsa-miR-3926', 'chr8_hsa-miR-486-3p',
## 'chr8_hsa-miR-486-5p', 'chr8_hsa-miR-7112-3p', 'chr8_hsa-miR-7112-5p',
## 'chr9_hsa-miR-1302', 'chr9_hsa-miR-3689d', 'chr9_hsa-miR-3910', 'chrX_hsa-
## miR-105-3p', 'chrX_hsa-miR-105-5p', 'chrX_hsa-miR-1184', 'chrX_hsa-miR-3202',
## 'chrX_hsa-miR-450a-5p', 'chrX_hsa-miR-4536-3p', 'chrX_hsa-miR-4536-5p',
## 'chrX_hsa-miR-509-3p', 'chrX_hsa-miR-509-5p', 'chrX_hsa-miR-513a-3p', 'chrX_hsa-
## miR-513a-5p', 'chrX_hsa-miR-514a-3p', 'chrX_hsa-miR-514a-5p'
## Error in `.rowNamesDF<-`(x, value = value): duplicate 'row.names' are not allowed
This gives the same error as before –> there are still some duplicates present.
# duplicated(up_rn)  # TRUE/FALSE flag marking the duplicated keys
# unique(up_rn)      # the unique keys themselves
# Count how many distinct keys the row_names column holds.
summarise(meta_dat, count = n_distinct(row_names))
## count
## 1 2666
The above gives the number of unique elements in the row_names column.
Meaning that:
Of the 2794 values, 2666 are unique.
i.e. 2794 - 2666 = 128 are duplicates
Removing the duplicated names then:
# dups <- meta_dat %>%
#   group_by(row_names) %>%
#   filter(n() > 1)
# Remove duplicates based on row_names, keeping all other columns of the
# retained rows.
meta_dat <- distinct(meta_dat, row_names, .keep_all = TRUE)
# With the keys now unique, use them as the row names.
up_rn <- as.character(meta_dat[["row_names"]])
row.names(meta_dat) <- up_rn
paged_table(meta_dat)
Mean_expression was rounded to give a whole number r_exp to make it easier for distinguishing expression levels:
# Frequency table: round the mean expression to a whole number (r_exp) and
# list the summary columns ahead of everything else.
meta_dat <- meta_dat %>%
  mutate(r_exp = round(Mean_expression)) %>%
  select("chr", "names", "Mean_expression", "r_exp", everything())
The above steps (i.e. any further data manipulation step) removes the row names. So, we’ll have to leave adding the row names to the last step, just before saving the data. But till then, I’ll make the row_names column the first one so that the data is easier to identify.
3 separate variables were created having the frequencies (no. of miRNAs) according to expression levels, and then put together in a separate df, exp_data
Separating the novel miRNA according to mean expression levels:
1. zero <- Average expression level = 0
2. bet_1_10 <- Average expression levels from 1 to 10
3. more_10 <- Average expression level > 10
## # A tibble: 22 x 2
## r_exp freq
## <dbl> <int>
## 1 0 1604
## 2 1 286
## 3 2 139
## 4 3 81
## 5 4 69
## 6 5 50
## 7 6 75
## 8 7 56
## 9 8 41
## 10 9 37
## # ... with 12 more rows
# Number of miRNAs whose rounded mean expression is exactly 0.
zero <- meta_dat %>%
  filter(r_exp == 0) %>%
  summarise(zero_exp = n())
# Number of miRNAs whose rounded mean expression lies from 1 to 10 inclusive.
bet_1_10 <- meta_dat %>%
  filter(between(r_exp, 1, 10)) %>%
  summarise(mid_exp = n())
# Number of miRNAs whose rounded mean expression is above 10.
more_10 <- meta_dat %>%
  filter(r_exp > 10) %>%
  summarise(more_exp = n())
# zero + bet_1_10 + more_10
# Assemble the long table (one row per expression level) directly. The former
# bind_cols() -> t() -> as_tibble() route produced an unnamed matrix and
# depended on automatic "V1" column naming, which tibble has deprecated.
rn <- c("zero_exp", "mid_exp", "more_exp")
exp_data <- tibble(
  number = c(zero[["zero_exp"]], bet_1_10[["mid_exp"]], more_10[["more_exp"]]),
  exp_levels = rn
)
A bar plot showing the number of novel miRNA according to their mean expression levels was created using
ggplot2 and plotly:
# Bar chart of the number of miRNAs per expression level, ordered
# zero -> mid -> high.
# Fixes relative to the earlier version:
#   * the middle category is between(r_exp, 1, 10), so its axis label now
#     reads "Mid (1-10)" instead of "Mid (1-5)";
#   * the subtitle reports the number of miRNAs actually counted in exp_data
#     rather than a hard-coded 2794 (duplicates are removed before counting);
#   * the fill mapping in aes() is dropped, since the constant fill on the
#     bars overrode it.
a <- exp_data %>%
  mutate(
    exp_levels = fct_relevel(exp_levels, c("zero_exp", "mid_exp", "more_exp"))
  ) %>%
  ggplot(aes(x = exp_levels, y = number)) +
  # text = paste0(exp_levels, "Avg expression")) + # default value
  geom_bar(stat = "identity", width = 0.5, fill = "#FFC7EA", colour = "black") +
  geom_text(aes(label = number), nudge_y = 50) +
  scale_x_discrete(labels = c("Zero", "Mid (1-10)", "High")) +
  labs(
    title = "Expression Levels of novel miRNAs",
    subtitle = paste(
      "Average expression levels of", sum(exp_data$number), "miRNAs"
    ),
    x = "Expression Level",
    y = "No. of miRNA"
  ) +
  coord_cartesian(ylim = c(0, 2000))
a
***
Finally Adding Row Names
# Columns 6 to 111 of meta_dat are taken as the expression-only table.
# NOTE(review): these positions depend on the column order built up earlier
# in the script — confirm they still match before reusing this step.
eDat <- meta_dat [c(6:111)] #expression only data
# Write the three tables to the processed-data folder as .xlsx files, with
# both column names and row names included; append = FALSE means each call
# does not append to an existing file.
write.xlsx2(eDat, file ="Z:/Nikita/Projects/mirna_fetal_tissues/data/processed/miRNA_fetal_tissues_eDat.xlsx", col.names = TRUE, row.names = TRUE, append = FALSE)
write.xlsx2(pDat, file ="Z:/Nikita/Projects/mirna_fetal_tissues/data/processed/miRNA_fetal_tissues_pDat.xlsx", col.names = TRUE, row.names = TRUE, append = FALSE)
write.xlsx2(meta_dat, file ="Z:/Nikita/Projects/mirna_fetal_tissues/data/processed/miRNA_fetal_tissues_meta_dat.xlsx", col.names = TRUE, row.names = TRUE, append = FALSE)
#write.xlsx for smaller files and write.xlsx2 for bigger files